The objective of this project is to implement an AI model to predict the price of Bitcoin (BTC) in Tether (USDT) and in Ethereum (ETH). In this work we are not interested in the exact exchange rates between Bitcoin and these two other cryptocurrencies, but rather in their fluctuations, i.e. the high price, the low price, the average price and the median price over the next 5 minutes.
To carry out this project, we used two datasets. The first collects information about the price of Bitcoin in USDT over a period of 3 days (from ... to ...), while the second contains the same type of information but for the price ratio between Ethereum and Bitcoin, over a slightly longer period from ... to .... This data is collected at irregular intervals, with timestamps in milliseconds.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.dates as mdates
import statsmodels.api as sm
import statsmodels.tsa as st
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Activation, Dropout
# FIX: import Sequential from tensorflow.keras rather than the standalone
# `keras` package — mixing the two Keras distributions with the tf.keras
# layers imported above can fail at model-build time.
from tensorflow.keras.models import Sequential
from tqdm.auto import tqdm
# data on the price ratio between ETH and BTC
# NOTE(review): the JSON files are assumed to contain at least 'price' and
# 'time' columns, with 'time' in epoch milliseconds (see transform_st usage
# below) — confirm against the data source.
ethbtc=pd.read_json(r'ETHBTC_240000000_5000000.json')
# data on the price ratio between BTC and USDT
btcusdt=pd.read_json(r'BTCUSDT_690000000_5000000.json')
# Notebook-only cosmetics: lay a few output cells out horizontally so the
# side-by-side tables produced below fit on screen.
from IPython.display import display, HTML

css = """
div.cell:nth-child(13) .output, div.cell:nth-child(15) .output {
flex-direction: row;
}
div.cell:nth-child(19) .output, div.cell:nth-child(21) .output , div.cell:nth-child(27) .output{
flex-direction: row;
padding:0;
}
"""
HTML(f'<style>{css}</style>')

# Quick look at the raw data, one captioned table per dataset.
display(btcusdt.head(3).style.set_caption('BTCUSD'))
display(ethbtc.head(3).style.set_caption('ETHBTC'))
We start by selecting, in each dataset, the variables we are interested in: time and price
# Keep only the traded price and its timestamp from each raw dataset.
selected_cols = ['price', 'time']
btcusd = btcusdt[selected_cols]
eth = ethbtc[selected_cols]
An overview of the results of these selections is presented below.
# Preview the first rows of each trimmed dataset, with a caption per table.
display(btcusd.head(3).style.set_caption('BTCUSD'))
display(eth.head(3).style.set_caption('ETHBTC'))
We check the presence of null values:
We notice the absence of null values in both databases
display("BTCUSD", btcusd.isnull().any())
display('ETHBTC',eth.isnull().any())
The observations being irregularly distributed in time, we correct this problem by adapting the time interval to our problem (5 minutes). We also compute for each price sample observed in a time interval the highest price (hight), the lowest price (low), the average price (avg) and the median price (med)
"""
This function transforms a variable of a datafarme into a time series following a given time interval
"""
def transform_st(data, var_to_transform, ts_var, freq_final, freq_init):
# Transformation de la dataframe en pandas series temporelles
data=data.set_index(ts_var).sort_index()
data.index=pd.to_datetime(data.index,unit=freq_init)
# Transformation de la fréquence
data=data.groupby([pd.Grouper(freq=freq_final)]).agg(
hight=(var_to_transform, lambda x: x.max()),
low=(var_to_transform, lambda x: x.min()),
avg=(var_to_transform,np.mean),
med=(var_to_transform,np.median),
)
return data
# Resample each raw price series into 5-minute aggregate bars
# (raw timestamps are epoch milliseconds).
btcusd_df = transform_st(btcusd, var_to_transform='price', ts_var='time',
                         freq_final='5min', freq_init='ms')
eth_df = transform_st(eth, var_to_transform='price', ts_var='time',
                      freq_final='5min', freq_init='ms')
The result is as follows:
# First rows of the resampled 5-minute series.
display(btcusd_df.head(3).style.set_caption('BTCUSD'))
display(eth_df.head(3).style.set_caption('ETHBTC'))
Some descriptive statistics of the different variables are presented below
# Summary statistics (count, mean, std, quartiles, ...) per 5-minute series.
display(btcusd_df.describe().style.set_caption('BTCUSD'))
display(eth_df.describe().style.set_caption('ETHBTC'))
# Plot the four aggregated series of each asset side by side.
# var_name / var_label / titles are reused by later cells — keep the names.
var_name = btcusd_df.columns.to_list()
var_label = ['Hight', 'Low', 'Mean', 'Median']
titles = ['BTCUSD', 'ETHBTC']
fig, ax = plt.subplots(1, 2, figsize=(15, 9))
for column, label in zip(var_name, var_label):
    ax[0].plot(pd.to_datetime(btcusd_df.index), column, data=btcusd_df, label=label)
    ax[1].plot(pd.to_datetime(eth_df.index), column, data=eth_df, label=label)
for axis, panel_title in zip(ax, titles):
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H'))
    axis.legend()
    axis.set_title(panel_title)
fig.autofmt_xdate()
plt.show()
The graphs above show an increasing trend in the price of Bitcoin relative to the other two currencies. They also suggest that the price series are non-stationary. The results of the ADF (Augmented Dickey-Fuller) stationarity test confirm this observation.
# Augmented Dickey-Fuller (ADF) unit-root test applied column by column.
def adf_test(data, lag=0):
    """Return a dataframe of ADF test statistics and p-values per column.

    With lag == 0 the test runs on the raw series; otherwise it runs on the
    first difference with the first `lag` observations dropped (removing the
    NaN that .diff() introduces at the start).
    """
    results = pd.DataFrame(['Test Statistic', 'p-value'])
    for column in data.columns:
        series = data[column] if lag == 0 else data[column].diff()[lag:]
        # adfuller returns (statistic, pvalue, usedlag, nobs, ...); keep two.
        results[column] = st.stattools.adfuller(series, autolag='AIC')[0:2]
    results = results.set_index(results.columns[0])
    results.index.name = None
    return results
# ADF results per asset; p-values above 0.10 mean the unit-root null stands.
display(adf_test(btcusd_df).style.set_caption('BTCUSD'))
display(adf_test(eth_df).style.set_caption('ETHBTC'))
For all variables the p-value is greater than the critical threshold of 10%. We therefore fail to reject the null hypothesis of the presence of a unit root, i.e. the series are non-stationary.
In addition to being non-stationary, the series clearly show a non-linear trend. Recurrent neural network models of the LSTM (Long Short-Term Memory) type are well suited to this type of problem. To implement such a model, it is necessary to define the number of lags to consider for a good estimation. We make this choice based on the partial autocorrelations of the series made stationary by first differencing.
These partial auto-correlation functions (PACF) indicate that for each of the series a lag of order 3 is sufficient to estimate the contemporaneous values. In other words, the level of the bitcoin price in the next 5 minutes can be satisfactorily explained by its variations over the last 15 (3x5) minutes.
The correlograms of the different series are shown below
# Correlograms: PACF of each first-differenced series, one row per asset.
fig, ax = plt.subplots(2, 4, figsize=(15, 10))
for idx, (column, label) in enumerate(zip(var_name, var_label)):
    # plot_pacf returns the parent figure, so `fig` keeps pointing at the grid.
    fig = sm.graphics.tsa.plot_pacf(btcusd_df[column].diff()[1:], lags=40, ax=ax[0, idx], title=None)
    fig = sm.graphics.tsa.plot_pacf(eth_df[column].diff()[1:], lags=40, ax=ax[1, idx], title=None)
    ax[1, idx].set(xlabel=label)
    # Only the leftmost column keeps its y tick labels.
    if idx != 0:
        ax[0, idx].yaxis.set_ticklabels([])
        ax[1, idx].yaxis.set_ticklabels([])
ax[0, 0].set(ylabel='BTCUSD')
ax[1, 0].set(ylabel='ETHBTC')
fig.suptitle('Partial autocorrelation function', fontsize=16)
plt.show()
First, we proceed to a normalization of the data. This procedure is a good practice since it not only reduces the risk of overfitting in cases where the variance of some features may be higher than others but also speeds up the computation time.
Then, we partition the data into 3 groups: training data (70%) to estimate the model, validation data (20%) to optimize the hyperparameters and test data (10%) to assess the performance of our algorithm.
Finally, since the LSTM model requires the organization of the input data into blocks, we group the data by 15-minutes blocks to predict the price levels in the next 5 minutes.
# Normalization of the data
from sklearn.preprocessing import MinMaxScaler
# One scaler per asset so each can be inverse-transformed independently.
scale_btc = MinMaxScaler(feature_range=(0, 1))
scale_eth = MinMaxScaler(feature_range=(0, 1))
# NOTE(review): the scalers are fit on the FULL series before the
# train/validation/test split below, which leaks test-set min/max into
# training — consider fitting on the training partition only.
btcusd=scale_btc.fit_transform(btcusd_df.values)
eth=scale_eth.fit_transform(eth_df.values)
# splitting the dataset
def split_data(data, training_fold_size, validation_fold_size):
    """Split the rows of `data` into train / validation / test partitions.

    Fold sizes are fractions of the number of rows (floored); the test
    partition receives whatever rows remain after the first two folds.
    """
    n_rows = data.shape[0]
    n_train = int(np.floor(n_rows * training_fold_size))
    n_valid = int(np.floor(n_rows * validation_fold_size))
    # np.split cuts at the given row offsets, yielding three views in order.
    train, valid, test = np.split(data, [n_train, n_train + n_valid])
    return train, valid, test
# We want 70% of the data to be used for training, 20% for validation and 10% for testing
training_fold_size=0.7
validation_fold_size=0.2
# Each asset's partitions live in a dict keyed 'train' / 'valid' / 'test'.
BTC=dict(zip(['train', 'valid', 'test'],list(split_data(btcusd,training_fold_size,validation_fold_size))))
ETH=dict(zip(['train', 'valid', 'test'],list(split_data(eth,training_fold_size,validation_fold_size))))
# Print the size of the dataset
print('BTCUSD: \nTrain data size: ==========> {} \nValidation data size: =====> {} \nTest data size: ===========> {}'.format(
    BTC['train'].shape,BTC['valid'].shape,BTC['test'].shape))
print('ETHBTC: \nTrain data size: ==========> {} \nValidation data size: =====> {} \nTest data size: ===========> {}'.format(
    ETH['train'].shape,ETH['valid'].shape,ETH['test'].shape))
"""
This function allows to format the data into blocks usable by the LSTM model. It takes as input a matrix
(numpy array) and a number of lags defined to build the block of explanatory variables on the one hand and the
matrix of dependent variables on the other hand (this matrix contains
as many columns as there are series).
"""
def create_blocs(data,step):
nobs=data.shape[0]
nvar=data.shape[1]
exog=data[range(step)]
endo=data[step:step+1].reshape(nvar)
start=1
end=step+1
while end<nobs-1:
lagged_values=data[range(start,end)]
labels=data[end:end+1].reshape(nvar)
exog=np.vstack((exog,lagged_values))
endo=np.vstack((endo,labels))
start+=1
end+=1
return exog.reshape(-1,step,data.shape[1]), endo
# Definition of the number of lags
step=3
# Creation of the different data sets (trainset, validation set and testset) for BTCUSDT
BTC_=dict(train=list(create_blocs(BTC['train'],step)),
          valid=list(create_blocs(BTC['valid'],step)),
          test=create_blocs(BTC['test'],step))
# Creation of the different data sets (trainset, validation set and testset) for ETHBTC
ETH_=dict(train=list(create_blocs(ETH['train'],step)),
          valid=list(create_blocs(ETH['valid'],step)),
          test=create_blocs(ETH['test'],step))
# Displaying the size of the partitions.
# BUG FIX: the original format strings had only three '{}' placeholders for
# six arguments, so the "Validation" figure actually showed the train target
# shape and the "Test" figure the validation input shape (extra args were
# silently ignored). Six placeholders now match the six (x, y) shapes.
print('BTCUSDT ===> Train data size {} {} | Validation data size: {} {} | Test data size: {} {}'.format(
    BTC_['train'][0].shape, BTC_['train'][1].shape,
    BTC_['valid'][0].shape, BTC_['valid'][1].shape,
    BTC_['test'][0].shape, BTC_['test'][1].shape))
print('ETHBTC ===> Train data size {} {} | Validation data size: {} {} | Test data size: {} {}'.format(
    ETH_['train'][0].shape, ETH_['train'][1].shape,
    ETH_['valid'][0].shape, ETH_['valid'][1].shape,
    ETH_['test'][0].shape, ETH_['test'][1].shape))
We set up a neural network consisting of two layers (1 hidden layer and 1 output layer), in addition to the input layer. In the hidden layer, we choose to use 128 neurons with LSTM cells and a hyperbolic tangent function (tanh) as activation function (non linear) of these neurons.
In order to limit the risk of overfitting, we introduce a dropout rate of 0.1. This means that 10% of the hidden layer's outputs are randomly and temporarily dropped at each iteration of the gradient descent.
To optimize the model we set up a function that compares the performances of the estimates with different batch sizes and a varying number of epochs. We use the mean square error (MSE) as a performance measure
NB: The performance of this model can be improved by optimizing the other hyperparameters (learning rate, number of neurons, number of hidden ticks, etc.)
# Definition of the non-variable arguments of our neural network
# Adam learning rate.
learning_rate=0.0001
# Default batch size (overridden during the grid search below).
batch_size = 16
# Number of LSTM units in the single hidden layer.
neurones=128
# Fraction (10%) of the hidden layer's outputs randomly dropped per update.
dropout=0.1
# Function to estimate the neural network LSTM
def estimate_lstm(xtrain, ytrain, xvalid, yvalid, neurones, learning_rate,
                  dropout,callback=None, batch_size=32, epochs=1000, verbose=0):
    """Build, compile and fit a one-hidden-layer LSTM network.

    Returns the Keras History produced by model.fit; the fitted model itself
    is reachable through the History's .model attribute.
    """
    # Single LSTM layer (tanh), dropout for regularization, linear output
    # with one unit per target series.
    network = Sequential([
        LSTM(units=neurones, return_sequences=False, activation='tanh',
             input_shape=(xtrain.shape[1], xtrain.shape[2])),
        Dropout(dropout),
        Dense(units=ytrain.shape[1]),
    ])
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(loss="mean_squared_error", optimizer=adam)
    history = network.fit(xtrain, ytrain, batch_size=batch_size, epochs=epochs,
                          validation_data=(xvalid, yvalid),
                          shuffle=False, callbacks=callback, verbose=verbose)
    return history
# This function allows us to determine, for a given model, the optimal batch size and the number of epochs.
# For each asset, fit the network with several batch sizes and keep the one
# with the lowest final validation loss, together with the epochs actually run.
hyperparam_opt=dict(btc=dict(data=BTC_, batch=[]),eth=dict(data=ETH_, batch=[]))
# Stop training once val_loss has not improved for 7 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=7)
for asset in tqdm(hyperparam_opt.keys(), desc= 'Modeles'):
    data=hyperparam_opt[asset]['data']
    for batch_size in tqdm([16,24,32,64,128], desc= 'Batch'):
        # `model` is actually a Keras History object (estimate_lstm returns
        # the result of model.fit).
        model=estimate_lstm(data['train'][0], data['train'][1],
                            data['valid'][0], data['valid'][1],
                            neurones, learning_rate, dropout,callback, batch_size, verbose=0)
        loss=model.history['val_loss'][-1]
        # Best-so-far record: [batch_size, final val_loss, epochs run].
        # NOTE(review): len(val_loss) includes the 7 patience epochs after the
        # best epoch, so the "optimal" epoch count is slightly overstated.
        if hyperparam_opt[asset]['batch']==[]:
            hyperparam_opt[asset]['batch']=[batch_size, model.history['val_loss'][-1],
                                            len(model.history['val_loss'])]
        elif hyperparam_opt[asset]['batch'][1]>loss:
            hyperparam_opt[asset]['batch']=[batch_size, model.history['val_loss'][-1],
                                            len(model.history['val_loss'])]
# Respective display of the optimal batch size and number of epochs for the prediction of the BTC/USDT price ratio
# FIX: corrected the "Optilmal" misspelling in the printed messages.
btc_opti=hyperparam_opt['btc']['batch']
print('BTCUSDT Model: \n\nOptimal batch size: ===> {} \nOptimal number of epochs: ===> {}'.format(
    btc_opti[0], btc_opti[2]))
# Respective display of the optimal batch size and number of epochs for the prediction of the ETH/BTC price ratio
eth_opti=hyperparam_opt['eth']['batch']
print('ETHBTC Model: \n\nOptimal batch size: ===> {} \nOptimal number of epochs: ===> {}'.format(
    eth_opti[0], eth_opti[2]))
# Estimation of the BTC/USDT model with optimal hyperparameters
print("Running....", end='\r')
data=BTC_
# opt_par = [batch_size, final val_loss, n_epochs] from the grid search above.
opt_par=hyperparam_opt['btc']['batch']
btc_model=estimate_lstm(data['train'][0], data['train'][1],
                        data['valid'][0], data['valid'][1],
                        neurones, learning_rate, dropout, batch_size=opt_par[0], epochs=opt_par[2], verbose=0)
print("Terminated")
# Estimation of the ETH/BTC model with optimal hyperparameters
print("Running....", end='\r')
data=ETH_
# opt_par = [batch_size, final val_loss, n_epochs] from the grid search above.
opt_par=hyperparam_opt['eth']['batch']
eth_model=estimate_lstm(data['train'][0], data['train'][1],
                        data['valid'][0], data['valid'][1],
                        neurones, learning_rate, dropout, batch_size=opt_par[0], epochs=opt_par[2], verbose=0)
print("Terminated")
# Evolution of the loss function as a function of the number of epochs, by data set and by price ratio
fig, ax = plt.subplots(1,2, figsize = (15,5))
hist=[btc_model, eth_model]
title=['BTCUSD', 'ETHBTC']
for i in range(2):
    ax[i].plot(hist[i].history['loss'], label='train')
    # FIX: this curve is the VALIDATION loss (Keras 'val_loss'), not the test
    # loss; the legend previously mislabelled it as 'test'.
    ax[i].plot(hist[i].history['val_loss'], label='validation')
    ax[i].legend()
    ax[i].set_title(title[i])
plt.show()
# Display of MSE values for each model (final validation loss).
btc_loss = btc_model.history['val_loss'][-1]
eth_loss = eth_model.history['val_loss'][-1]
# FIX: corrected the "BTCUSF" typo in the printed label.
print('BTCUSD loss: ====> {} \nETHBTC loss: ====> {}'.format(btc_loss, eth_loss))
We test the prediction quality of this algorithm with the test data
"""
Forecasting function:
This function allows to make price predictions from a database (pandas dataframe or numpy array), from an
estimated model. In this context, the model will be either the model intended for the estimation of the btcusdt,
or the one related to the ethbtc.
"""
def forcast_btc(data,modele,step):
scale = MinMaxScaler(feature_range=(0, 1))
data=scale.fit_transform(data.values)
x,y=create_blocs(data,step)
pred= modele.model.predict(x)
pred=scale.inverse_transform(pred)
return pred
# Realization of prediction operations
# Number of target rows in each test partition — used to slice the tail of
# the original (unscaled) dataframes for forecasting.
btc_test_shape=BTC_['test'][1].shape[0]
eth_test_shape=ETH_['test'][1].shape[0]
result_eth=forcast_btc(eth_df.iloc[-eth_test_shape:,],eth_model,step=3)
result_btc=forcast_btc(btcusd_df.iloc[-btc_test_shape:,],btc_model,step=3)
# Function to make a graphical representation of the prediction results
def plot_prediction_result(data_df, result_test, title):
    """Overlay actual and predicted series in a 2x2 plotly grid.

    data_df : dataframe of actual values (hight/low/avg/med columns).
    result_test : prediction array aligned with the tail of data_df.
    title : overall figure title.
    """
    # Align the predictions with the last len(result_test) timestamps.
    predicted = pd.DataFrame(result_test,
                             index=data_df.iloc[-result_test.shape[0]:].index,
                             columns=data_df.columns)
    actual_x = pd.to_datetime(data_df.index)
    predicted_x = pd.to_datetime(predicted.index)
    positions = [(1,1),(1,2),(2,1),(2,2)]
    # Layout creation: one scatter subplot per aggregated series.
    fig = make_subplots(
        rows=2, cols=2,
        shared_xaxes=True,
        vertical_spacing=0.05,
        specs=[[{"type": "scatter"},{"type": "scatter"}],
               [{"type": "scatter"},{"type": "scatter"}],
               ],
        subplot_titles=("Hight", "Low", "Mean", "Median")
    )
    for idx, var in enumerate(predicted.columns.to_list()):
        row, col = positions[idx]
        # Show the legend only once, on the first subplot.
        show_legend = idx == 0
        fig.add_trace(
            go.Scatter(
                x=actual_x,
                y=data_df[var],
                mode="lines",
                name='Actual',
                line=dict(color='royalblue'),
                showlegend=show_legend,
            ),
            row=row, col=col
        )
        fig.add_trace(
            go.Scatter(
                x=predicted_x,
                y=predicted[var],
                mode="lines",
                name="Predicted",
                line=dict(color='red'),
                showlegend=show_legend,
            ),
            row=row, col=col
        )
    fig.update_layout(
        height=800,
        showlegend=True,
        title_text=title,
    )
    return fig.show(renderer="notebook")
# Prediction of the BTC/USDT price ratio over the next 5 minutes
plot_prediction_result(btcusd_df,result_btc,title="Bitcoin prediction (BTCUSD)")
# Prediction of the ETH/BTC price ratio over the next 5 minutes
# NOTE(review): the title still says "Bitcoin prediction" for the ETH/BTC
# ratio — confirm whether "Ethereum prediction (ETHBTC)" was intended.
plot_prediction_result(eth_df,result_eth,title="Bitcoin prediction (ETHBTC)")